In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Library import
import pandas as pd
import re

from minimal_set import MinimalSetCalc


In [3]:
# Read the normalized GSE97500 expression data; the first CSV column is used
# as the row index (the sample names).
ts_df = pd.read_csv(
    'data/GSE97500/normalized/all.csv.gz',
    compression='gzip',
    index_col=0,
)
ts_df.head()

Unnamed: 0,AT4G14695,AT1G63110,AT3G49530,AT1G34550,AT1G79120,AT2G05642,AT5G49850,AT5G23180,AT4G34170,AT2G22560,...,AT3G52230,AT5G01630,AT4G19140,AT2G44050,AT1G33260,AT4G25430,AT1G01840,AT5G45780,AT1G03030,AT3G18850
R5C-1,-0.259642,-0.047704,0.268901,-0.156076,-0.263408,-0.271359,-0.070127,-0.271798,-0.271798,-0.216495,...,0.58443,-0.251601,-0.084589,0.101142,0.006748,-0.271277,-0.261523,-0.234499,-0.168821,-0.216008
R5C-2,-0.196832,-0.099989,-0.057585,-0.173159,-0.195185,-0.19684,-0.010986,-0.197009,-0.197009,-0.157582,...,0.398797,-0.18425,-0.190234,0.056801,-0.184529,-0.196875,-0.172338,-0.178381,-0.179526,-0.192503
R5C-3,-0.065676,-0.035098,-0.027644,-0.059451,-0.065182,-0.065767,-0.015416,-0.065767,-0.065767,-0.055447,...,0.116732,-0.062351,-0.061978,0.008582,-0.062762,-0.065698,-0.061705,-0.060286,-0.062971,-0.063316
R10C-1,-0.226435,-0.018075,0.354094,-0.122074,-0.235026,-0.244525,0.117238,-0.244752,-0.244752,-0.189135,...,0.451323,-0.213386,-0.036578,0.043757,-0.020142,-0.242229,-0.229832,-0.192828,-0.135687,-0.191709
R10C-2,-0.159971,-0.116181,-0.067596,-0.140832,-0.159247,-0.160476,-0.082394,-0.160545,-0.160545,-0.14183,...,0.321067,-0.148548,-0.141449,0.006088,-0.148283,-0.160327,-0.128911,-0.154555,-0.14718,-0.158174


In [4]:
# Parse the sample names in the index and retrieve every distinct time point.
# Names are expected to look like "R5C-1": one capital letter, the time point,
# one capital letter, then "-<digits>".
pattern = r'^([A-Z])(\d+)([A-Z])-(\d+)$'

ts_time_steps = set()
for ind in ts_df.index:
    name_match = re.match(pattern, ind)
    if name_match is None:
        # Report the offending name instead of failing with an opaque
        # "'NoneType' object has no attribute 'groups'".
        raise ValueError(
            "unexpected sample name in expression index: {!r}".format(ind)
        )
    # Group 2 is the time point embedded in the sample name.
    ts_time_steps.add(int(name_match.group(2)))

# Sorted list of the distinct time points; later cells index into it.
ts_time_steps = sorted(ts_time_steps)
print(ts_time_steps)
len(ts_time_steps)

[5, 10, 15, 20, 30, 45, 60, 90, 120]


9

In [5]:
# Build one metadata entry per sample (row of ts_df), kept in four parallel
# lists: the sample's time point, the ';'-joined names of the samples at the
# next time point, an end-of-series flag, and a test-split flag.
time_list = []
next_exp_entries_list = []
is_end_list = []
is_test_list = []

# Position of every time point in the sorted ts_time_steps list, computed once
# instead of calling list.index() for each sample.
time_step_position = {step: pos for pos, step in enumerate(ts_time_steps)}
last_position = len(ts_time_steps) - 1
# first 2 thirds of the series as training
train_cutoff = len(ts_time_steps) * 2 / 3
# NOTE(review): replicate ids '1'..'3' are hard-coded and the generated names
# are not checked against ts_df.index -- confirm every next time point really
# has these three replicates.
replicate_ids = ['1', '2', '3']

for ind in ts_df.index:
    name_match = re.match(pattern, ind)
    if name_match is None:
        # Report the offending name instead of failing with an opaque
        # "'NoneType' object has no attribute 'groups'".
        raise ValueError(
            "unexpected sample name in expression index: {!r}".format(ind)
        )
    prefix, time_text, suffix, _ = name_match.groups()
    current_time = int(time_text)
    current_time_i = time_step_position[current_time]
    time_list.append(current_time)

    if current_time_i < last_position:
        # getting next step information: same prefix/suffix letters, next time point
        next_time = ts_time_steps[current_time_i + 1]
        next_exp_entries = [
            "{}{}{}-{}".format(prefix, next_time, suffix, rep)
            for rep in replicate_ids
        ]
        next_exp_entries_list.append(';'.join(next_exp_entries))
        is_end_list.append(0)
        is_test_list.append(0 if current_time_i < train_cutoff else 1)
    else:
        # handling tails of the series: no next step, flagged as end and not as test
        next_exp_entries_list.append('')
        is_end_list.append(1)
        is_test_list.append(0)

In [6]:
# Assemble the per-sample metadata table; rows share the index of ts_df and
# the columns come from the parallel lists built in the previous cell.
ts_meta_df = pd.DataFrame(
    {
        'time': time_list,
        'next_exp_entries': next_exp_entries_list,
        'is_end': is_end_list,
        'is_test': is_test_list,
    },
    index=ts_df.index,
)

In [7]:
# Preview the first rows of the metadata table.
ts_meta_df.head()

Unnamed: 0,time,next_exp_entries,is_end,is_test
R5C-1,5,R10C-1;R10C-2;R10C-3,0,0
R5C-2,5,R10C-1;R10C-2;R10C-3,0,0
R5C-3,5,R10C-1;R10C-2;R10C-3,0,0
R10C-1,10,R15C-1;R15C-2;R15C-3,0,0
R10C-2,10,R15C-1;R15C-2;R15C-3,0,0


In [22]:
# Write the metadata table to CSV. The same file is later handed to
# MinimalSetCalc through its ts_meta_df argument.
# NOTE(review): this cell's execution count (22) is out of sequence with its
# neighbours -- confirm the notebook still runs top to bottom on a fresh kernel.
ts_meta_df.to_csv('./data/GSE97500/ts_meta.csv')

In [8]:
# Load the TF and target gene lists. Each file is read with an explicit column
# name, and that column is then extracted with `.values`.
tf_list_df = pd.read_csv(
    './data/GSE97500/arabidopsis_tf.csv',
    names=['tf'],
)
tf_list = tf_list_df['tf'].values

target_list_df = pd.read_csv(
    './data/GSE97500/arabidopsis_target.csv',
    names=['target'],
)
target_list = target_list_df['target'].values

In [9]:
# Value passed to MinimalSetCalc as its cpu_cores argument.
cpu_cores = 10
# Predictors: the expression columns of every gene named in tf_list.
X = ts_df.loc[:, tf_list]
# Responses: the expression columns of target_list entries 6 through 9.
y = ts_df.loc[:, target_list[6:10]]

In [12]:
# Display the selected target columns.
y

Unnamed: 0,AT5G64040,AT1G08930,AT5G44340,AT1G64190
R5C-1,-0.173618,2.661913,2.997918,1.376483
R5C-2,-0.195131,0.259772,2.968871,2.031474
R5C-3,-0.065526,0.145995,0.917564,0.699066
R10C-1,-0.202375,2.686069,2.603019,0.669520
R10C-2,-0.159972,0.133264,2.208217,0.429023
...,...,...,...,...
S90N-2,1.638912,0.421286,2.631945,6.169318
S90N-3,1.259574,0.377123,2.851782,4.125264
S120N-1,1.409987,0.307143,2.971191,3.589296
S120N-2,1.342331,0.479385,3.721561,4.772678


In [10]:
# calculation setup
# NOTE(review): as recorded in this notebook, this call raised
# "AttributeError: 'numpy.ndarray' object has no attribute 'loc'".
# target_list[6:10] appears to be the only NumPy array passed in directly
# (X and y are DataFrame selections, ts_meta_df is a path string); the array
# could also be created inside MinimalSetCalc -- TODO confirm the expected
# argument types against minimal_set before re-running.
test_run = MinimalSetCalc(
    X,
    y,
    target_list[6:10],
    cpu_cores=cpu_cores,
    num_iterations=100,
    feature_keep_rate=0.5,
    is_ts=True,
    output_dir='./output/arabidopsis_ts/',
    ts_meta_df='data/GSE97500/ts_meta.csv',
)

AttributeError: 'numpy.ndarray' object has no attribute 'loc'

In [11]:
# Inspect the y_list attribute of the calculation object. This needs the setup
# cell above to have succeeded; otherwise test_run is undefined (NameError).
test_run.y_list

NameError: name 'test_run' is not defined