In [1]:
import os
import re
import urllib.parse
from io import StringIO
from os import listdir
from os.path import isfile, join

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

In [2]:
# Load the three yeast expression tables, each indexed by its first column.
# The GSE145936 file is tab-separated; the two TPM files are comma-separated.
df_1 = pd.read_table(
    '../data/yeast/GSE145936_Sis1-AA_Gene_counts_normalized.txt',
    index_col=0,
)
df_2 = pd.read_csv(
    '../data/yeast/GSE153609_gene_expression_TPM_all_times.csv',
    index_col=0,
)
df_3 = pd.read_csv(
    '../data/yeast/GSE168699_RNA_TPM_all_times.csv',
    index_col=0,
)

In [3]:
# Strip the columns that are not used downstream: the first seven columns of
# df_3 and the 'gene name' column of df_1.
# NOTE(review): both frames are reassigned in place, so re-running this cell
# removes seven more columns from df_3 and raises KeyError on df_1.
to_drop = df_3.columns[:7]
df_3 = df_3.drop(columns=to_drop)
df_1 = df_1.drop(columns=['gene name'])

In [4]:
# Genes present in the index of all three tables.
# The result is sorted because plain `list(set(...))` ordering depends on
# Python's per-process string-hash randomisation: without sorting, the row
# order of the filtered frames and the composition of the scraping chunks
# would change from one kernel session to the next.
common_genes = sorted(set(df_1.index) & set(df_2.index) & set(df_3.index))

In [None]:
# Display how many genes are shared by all three tables.
len(common_genes)

In [None]:
# Keep only the shared genes in every table, in the order given by
# common_genes, so the three frames end up row-aligned.
df_1, df_2, df_3 = (
    frame.loc[common_genes] for frame in (df_1, df_2, df_3)
)

In [None]:
# Write the filtered df_1 to disk.
# NOTE(review): nothing visible in this notebook reads temp.csv back; this
# looks like a leftover inspection dump — confirm before removing.
df_1.to_csv(path_or_buf='temp.csv')

In [None]:
def _min_max_scale(frame):
    """Linearly rescale each column of `frame` so its min maps to 0 and its max to 1."""
    col_min = frame.min()
    col_span = frame.max() - col_min
    # A column whose values are all equal has a span of 0 and yields NaN here.
    return (frame - col_min) / col_span


# Column-wise min-max normalisation, applied to each table independently.
normalized_df_1 = _min_max_scale(df_1)
normalized_df_2 = _min_max_scale(df_2)
normalized_df_3 = _min_max_scale(df_3)

In [None]:
# Stretch the normalised values by a factor of 100.
normalized_df_1, normalized_df_2, normalized_df_3 = (
    scaled.mul(100.0)
    for scaled in (normalized_df_1, normalized_df_2, normalized_df_3)
)

In [None]:
# Columns held out for evaluation, selected by position:
#   df_1 -> positions 3-5 and 9-11, df_2 -> positions 3-5, df_3 -> last five.
test_df_1 = normalized_df_1.iloc[:, [*range(3, 6), *range(9, 12)]]
test_df_2 = normalized_df_2.iloc[:, [*range(3, 6)]]
test_df_3 = normalized_df_3.iloc[:, -5:]

In [None]:
# Place the held-out columns of the three tables side by side (rows are genes).
test_exp = pd.concat((test_df_1, test_df_2, test_df_3), axis='columns')


In [None]:
# Evaluation pairs use the same one-step lag as the training pairs: the
# expression in one column (source) is used to predict the next column
# (target).  test_exp holds 6 columns from df_1 (two runs of three), 3 from
# df_2 and 5 from df_3, so the source skips the last column of every run
# (positions 2, 5, 8 and 13) and the target is the source shifted right by one.
# Previously test_target was an exact copy of test_source, which scored the
# models on same-time-point values instead of the next time point.
_test_source_cols = [0, 1, 3, 4, 6, 7, 9, 10, 11, 12]
test_source = test_exp.iloc[:, _test_source_cols]
test_target = test_exp.iloc[:, [col + 1 for col in _test_source_cols]]

In [None]:
# Training pairs: each source column is matched with the column one position
# to its right (target), i.e. a one-step lag.
# NOTE(review): for df_1 the pair 5 -> 6 is included; if df_1 holds two
# separate six-column series (as the test split's 3-5 / 9-11 selection
# suggests) that pair crosses the series boundary — confirm.
# NOTE(review): several training pairs (e.g. 3 -> 4 in df_1 and df_2) also fall
# inside the columns used for the test split — confirm this overlap is intended.
_src_cols_1 = [0, 1, 2, 3, 5, 6, 7, 8, 9]
_src_cols_2 = [0, 1, 2, 3]

train_source_df_1 = normalized_df_1.iloc[:, _src_cols_1]
train_target_df_1 = normalized_df_1.iloc[:, [col + 1 for col in _src_cols_1]]

train_source_df_2 = normalized_df_2.iloc[:, _src_cols_2]
train_target_df_2 = normalized_df_2.iloc[:, [col + 1 for col in _src_cols_2]]

# df_3: everything except the last four columns, and the same window shifted by one.
train_source_df_3 = normalized_df_3.iloc[:, :-4]
train_target_df_3 = normalized_df_3.iloc[:, 1:-3]

In [None]:
# Assemble the training matrices by joining the per-table slices column-wise;
# source_exp and target_exp stay aligned column by column.
source_exp = pd.concat(
    (train_source_df_1, train_source_df_2, train_source_df_3),
    axis='columns',
)
target_exp = pd.concat(
    (train_target_df_1, train_target_df_2, train_target_df_3),
    axis='columns',
)

In [None]:
# Gene whose expression is modelled, and its regulator tables
# (semicolon-separated, no header row, four columns; column 0 is the index).
target_gene = 'YHR044C'
neg_tf = pd.read_csv('./YHR044C_neg.csv', sep=';', names=range(4), index_col=0)
pos_tf = pd.read_csv('./YHR044C_pos.csv', sep=';', names=range(4), index_col=0)

# Keep only the entries of column 2 that appear in exactly one of the tables.
# NaN padding (rows with fewer than four fields) is dropped — it would only
# produce blank lines in the output — and the names are sorted so the files,
# and therefore the feature order of the models fitted later, are identical on
# every run instead of depending on set iteration order.
_neg_names = set(neg_tf[2].dropna())
_pos_names = set(pos_tf[2].dropna())

pd.Series(sorted(_neg_names - _pos_names), dtype=object).to_csv(
    'neg_temp.csv', index=False, header=False
)
pd.Series(sorted(_pos_names - _neg_names), dtype=object).to_csv(
    'pos_temp.csv', index=False, header=False
)



In [None]:
# Reload the exclusive TF lists with the TF identifier as the index.
# The temp files are written without a header row, so header=None is required:
# with the default header inference the first TF of each file was consumed as
# a column name and silently dropped from the index.
neg_tf = pd.read_csv('./neg_temp.csv', index_col=0, header=None)
pos_tf = pd.read_csv('./pos_temp.csv', index_col=0, header=None)

In [None]:
# Every TF identifier used as a model feature: negative-only entries first,
# followed by positive-only entries.
all_tf = np.concatenate((neg_tf.index.to_numpy(), pos_tf.index.to_numpy()))

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb

# Two regressors are compared on the same features: an ordinary least-squares
# baseline and a seeded random forest that also tracks its out-of-bag score.
lregr = LinearRegression()
regr = RandomForestRegressor(oob_score=True, random_state=42)
# Gradient-boosted alternative to the random forest, currently disabled:
# regr = xgb.XGBRegressor(random_state=42)

In [None]:
# Fit both models. Features are the TF rows of source_exp transposed so each
# row of the design matrix is one column of source_exp; the response is the
# target gene's row in target_exp.
_train_features = source_exp.loc[all_tf].T
_train_response = target_exp.loc[target_gene].T

regr.fit(_train_features, _train_response)
lregr.fit(_train_features, _train_response)

In [None]:
# Coefficient of determination (R^2) of the linear model on the held-out columns.
lregr.score(
    X=test_source.loc[all_tf].T,
    y=test_target.loc[target_gene].T,
)

In [None]:
# Coefficient of determination (R^2) of the random forest on the held-out columns.
regr.score(
    X=test_source.loc[all_tf].T,
    y=test_target.loc[target_gene].T,
)

In [None]:
# Mean squared error of the linear model on the held-out columns.
# Arguments are passed by keyword so truth and prediction cannot be confused.
mean_squared_error(
    y_true=test_target.loc[target_gene].T,
    y_pred=lregr.predict(test_source.loc[all_tf].T),
)

In [None]:
# Mean squared error of the random forest on the held-out columns.
# Arguments are passed by keyword so truth and prediction cannot be confused.
mean_squared_error(
    y_true=test_target.loc[target_gene].T,
    y_pred=regr.predict(test_source.loc[all_tf].T),
)

In [32]:
# Name-conversion table indexed by its first column; the scraping loop further
# down looks names up in this index and reads the 'ORF_name' column.
gene_name_df = pd.read_csv(
    filepath_or_buffer='./gene_name_switch.csv',
    index_col=0,
)

In [7]:
# Endpoint and headers for the YEASTRACT "find regulators" form POST.
request_url = 'http://www.yeastract.com/findregulators.php'
requeset_header = {
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.7"
    ),
    "accept-language": "en,en-US;q=0.9",
    "cache-control": "max-age=0",
    "content-type": "application/x-www-form-urlencoded",
    "upgrade-insecure-requests": "1",
    "Referer": "http://www.yeastract.com/formfindregulators.php",
    "Referrer-Policy": "strict-origin-when-cross-origin",
}

# URL-encoded form body used as a template. The gene list is CRLF-separated
# (%0D%0A); the scraping loop overwrites the 'genes' entry for every request.
queryStr = (
    'type=doc&t_pos=true&t_neg=true&use_na=true&evidence=plus&image=Y'
    '&genes='
    'YMR182C%0D%0AYBR083W%0D%0AYGL013C%0D%0AYHL020C%0D%0A'
    'YCL055W%0D%0AYHR124W%0D%0AYOR162C%0D%0AYJR127C%0D%0A'
    'YGL073W%0D%0AYFL031W%0D%0AYNL314W%0D%0AYJR060W%0D%0A'
    '&biggroup=0&subgroup=0&doc-species=0&synteny=0&pot-species=0&submit=Search'
)
# parse_qs decodes the body into {field: [value, ...]}.
query_data = urllib.parse.parse_qs(queryStr)

In [15]:
# Accumulators filled by the scraping loop: one entry per scraped gene, kept
# in the same order in both lists.
tf_res_list, target_gene_list = [], []

In [21]:
# Split the shared genes into 50 nearly equal batches, one HTTP request each.
target_gene_chunks = np.array_split(np.asarray(common_genes), 50)

In [22]:
# Query YEASTRACT for the documented regulators of every gene, one chunk per
# request, and record for each returned gene a '; '-joined list of regulator
# ORF names.  Results are appended to target_gene_list / tf_res_list, so
# re-running this cell without resetting those lists duplicates entries.

# ORF names as a set: `x in series` tests the Series *index*, not its values,
# so the previous `tf in gene_name_df['ORF_name']` check could never match and
# regulators already reported by ORF name were silently dropped.
orf_names = set(gene_name_df['ORF_name'])

for target_gene_chunk in tqdm(target_gene_chunks):
    # The form field takes one gene per line, CRLF-separated.
    request_genes_string = '\r\n'.join(target_gene_chunk)
    query_data['genes'][0] = request_genes_string
    r = requests.post(
        url=request_url,
        data=query_data,
        headers=requeset_header,
        timeout=120,  # avoid hanging the whole run on one stalled request
    )
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'lxml')
    tables = soup.find_all('table', {"summary": "main"})
    # Wrap the markup in StringIO: passing a literal HTML string to read_html
    # is deprecated in recent pandas releases.
    df_list = pd.read_html(StringIO(str(tables)), index_col=0)
    df = df_list[0]
    for index, row in df.iterrows():
        # Row labels look like '<name>/<something>'; keep the part before '/'.
        # A distinct name is used so the notebook-level `target_gene` chosen
        # for modelling is not overwritten by this loop.
        scraped_gene = index.split('/')[0]
        target_gene_list.append(scraped_gene)

        tf_res_array = row.iloc[0]
        if isinstance(tf_res_array, str):
            # Entries are terminated by '  -  Reference '; the trailing space is
            # added so the last terminator matches too, and the piece after the
            # final terminator is discarded.
            tf_res_array += ' '
            tf_list = tf_res_array.upper().split('  -  REFERENCE ')[:-1]
        else:
            # Empty cell (NaN): no regulators listed for this gene.
            tf_list = []

        converted_tf_list = []
        for tf in tf_list:
            tf = tf.strip()
            # Drop a trailing 'P' from the upper-cased name.
            # NOTE(review): presumably the protein-name suffix (e.g. 'Hap1p') — confirm.
            if tf.endswith('P'):
                tf = tf[:-1]
            if not tf:
                continue
            if tf in gene_name_df.index:
                converted_tf_list.append(gene_name_df.loc[tf, 'ORF_name'])
            elif tf in orf_names:
                converted_tf_list.append(tf)
        tf_res_list.append('; '.join(converted_tf_list))

100%|██████████| 50/50 [01:23<00:00,  1.67s/it]


In [30]:
# One row per scraped gene, with its '; '-joined regulator list in 'tf_list'.
out_df = pd.DataFrame({'tf_list': tf_res_list}, index=target_gene_list)

In [31]:
# Persist the gene -> regulators table.
# NOTE(review): the file name is spelled 'yeat_network.csv' (possibly meant
# 'yeast'); left unchanged because other code may read this exact path.
out_df.to_csv(path_or_buf='yeat_network.csv')