In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import glob
import os
from functools import cmp_to_key, partial
from itertools import product
from scipy import stats
import gzip
from Bio import SeqIO


# Regex that matches one number embedded in a string: an optional leading minus,
# an integer part (either "0" or digits with no leading zero), an optional
# ".digits" fraction and an optional e/E exponent with optional sign.
# The entire number is captured as group 1; the inner groups are non-capturing.
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'


In [2]:
# Load the processed matrix CSV; index_col=0 turns the first CSV column into
# the row index, so the remaining columns are the per-sample value columns.
exp_df = pd.read_csv('./GSE115553_ProcessedMatrixFile.csv', index_col=0)
# Preview the first two rows as the cell output.
exp_df.head(2)

Unnamed: 0,Dnmt3a_LSB52_0h_repA,Dnmt3a_LSB53_0h_repA,Dnmt3a_LSB52_0h_repB,Dnmt3a_LSB53_0h_repB,Dnmt3a_LSB52_2h_repA,Dnmt3a_LSB53_2h_repA,Dnmt3a_LSB52_2h_repB,Dnmt3a_LSB53_2h_repB,Keap1_LSB28_0h_repA,Keap1_LSB29_0h_repA,...,Nos2_LSB26_18h_repB,Nos2_LSB27_18h_repB,Tmem258_LSB30_6h_repA,Tmem258_LSB31_6h_repA,Tmem258_LSB30_6h_repB,Tmem258_LSB31_6h_repB,Tmem258_LSB30_18h_repA,Tmem258_LSB31_18h_repA,Tmem258_LSB30_18h_repB,Tmem258_LSB31_18h_repB
Fth1,5833.61,21109.88,5761.22,25680.01,7560.93,22720.71,8658.8,20374.68,33433.17,34159.48,...,49370.82,23860.32,14783.14,33860.23,14833.94,30887.65,15279.83,48270.76,13148.89,45032.11
Rn45s,14528.97,13527.62,16642.61,17926.43,18181.65,15706.77,12196.8,13436.17,13538.74,11889.42,...,17722.43,18624.35,18490.93,22965.37,26396.97,31177.56,19355.0,18919.87,19288.53,19262.36


In [3]:
# Split every column label of the form "<prefix>_<N>h_<suffix>" into a
# replicate name ("<prefix>_<suffix>") and an integer time point N, and collect:
#   time_set       - every distinct time point seen
#   rep_set        - every distinct replicate name seen
#   exp_dict       - replicate name -> list of its time points (in column order)
#   formated_names - one "<replicate>@<N>" label per column, in column order
# The pattern does not depend on the column, so it is defined once up front.
exp_pattern = r'(.*)_(\d+)h_(.*)'
time_set = set()
rep_set = set()
formated_names = []
exp_dict = {}
for name in exp_df.columns:
    match = re.search(exp_pattern, name)
    if match is None:
        # re.search returns None when nothing matches; fail with a message that
        # names the offending column instead of an AttributeError on None.
        raise ValueError(
            'column name %r does not match the expected pattern %r'
            % (name, exp_pattern)
        )
    rep_name = match.group(1)+'_'+match.group(3)
    rep_set.add(rep_name)
    time = int(match.group(2))
    time_set.add(time)
    exp_dict.setdefault(rep_name, []).append(time)
    # Build the label from the parsed integer so it is spelled exactly like the
    # "<replicate>@<str(int)>" labels constructed from time_list further down.
    formated_names.append(rep_name+'@'+str(time))

In [4]:
# Replace the raw column labels with the "<replicate>@<time>" labels.
# NOTE: this mutates exp_df in place; after it runs, the original
# "<prefix>_<N>h_<suffix>" labels are no longer present on exp_df.
exp_df.columns = formated_names
# Sort instead of relying on set iteration order: the order of a set of strings
# can differ between interpreter sessions (string hash randomization), which
# would make the column order of everything derived from rep_list vary per run.
rep_list = sorted(rep_set)
time_list = sorted(time_set)

In [5]:
# Keep only replicates that have at least 4 recorded time points.
# Rebuilding the list (rather than calling rep_list.remove in a loop) keeps the
# same elements in the same order but makes the cell safe to re-run: removing
# an already-removed key would raise ValueError.
# NOTE(review): 4 is presumably the number of distinct time points in
# time_list - confirm, and consider deriving it from len(time_list).
rep_list = [rep for rep in rep_list if len(exp_dict[rep]) >= 4]

In [6]:
# Column labels for each split, spelled "<replicate>@<time>", with replicates
# varying slowest and time points fastest.
#   train source: first and second time points
#   train target: second and third time points
#   test source : second-to-last time point
#   test target : last time point
train_source_idx = [
    '{}@{}'.format(rep, hour) for rep in rep_list for hour in time_list[:2]
]
train_target_idx = [
    '{}@{}'.format(rep, hour) for rep in rep_list for hour in time_list[1:3]
]
test_source_idx = [
    '{}@{}'.format(rep, hour) for rep in rep_list for hour in time_list[-2:-1]
]
test_target_idx = [
    '{}@{}'.format(rep, hour) for rep in rep_list for hour in time_list[-1:]
]

In [7]:
# Standardize every column independently: with axis=0, apply() hands each
# column to scipy.stats.zscore, which is called with its default arguments.
# The result is a new frame; exp_df itself is not modified by this line.
normalized_df = exp_df.apply(stats.zscore, axis=0)

In [9]:
# Write each split of the normalized frame to a gzip-compressed CSV under
# ./normalized/. The directory is created first because to_csv does not create
# missing parent directories; exist_ok=True keeps the cell safe to re-run.
output_dir = './normalized'
os.makedirs(output_dir, exist_ok=True)

for split_name, split_columns in (
    ('train_source', train_source_idx),
    ('train_target', train_target_idx),
    ('test_source', test_source_idx),
    ('test_target', test_target_idx),
):
    # Same file names as before: ./normalized/<split_name>.csv.gz
    normalized_df[split_columns].to_csv(
        output_dir + '/' + split_name + '.csv.gz', compression='gzip'
    )