Formatting sequence data to form target/source time series, then splits into training and testing sets

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import glob
import os
from functools import cmp_to_key, partial
from itertools import product

# regex for number extraction from string
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'

In [2]:
def train_test_source_target_split(df, time_set, rep_set, num_of_test_per_series=1):
    sorted_time_list = sorted(time_set, key=lambda time_str: float(re.search(number_pattern, time_str).group(1)))

    train_source_time = sorted_time_list[:-1-num_of_test_per_series]
    train_target_time = sorted_time_list[1:0-num_of_test_per_series]
    test_source_time = sorted_time_list[-1-num_of_test_per_series:-1]
    test_target_time = sorted_time_list[0-num_of_test_per_series:]

    train_source_list = ['@'.join([x, y]) for x, y in product(list(rep_set), train_source_time)]
    train_target_list = ['@'.join([x, y]) for x, y in product(list(rep_set), train_target_time)]
    test_source_list = ['@'.join([x, y]) for x, y in product(list(rep_set), test_source_time)]
    test_target_list = ['@'.join([x, y]) for x, y in product(list(rep_set), test_target_time)]
    return df[train_source_list], df[train_target_list], df[test_source_list], df[test_target_list]

In [3]:
def time_course_compare(item1, item2, valued_time=False):
    rep1, course1 = item1.split('@')
    rep1 = rep1+course1[-1]
    course1 = course1[:-1]
    rep2, course2 = item2.split('@')
    rep2 = rep2+course2[-1]
    course2 = course2[:-1]
    if (valued_time):
        course1 = float(re.search(number_pattern, course1).group(1))
        course2 = float(re.search(number_pattern, course2).group(1))
    if rep1 < rep2:
        return -1
    elif rep1 > rep2:
        return 1
    else:
        if course1 < course2:
            return -1
        elif course1 > course2:
            return 1
        else:
            return 0

In [4]:
gene_meta_df = pd.read_csv('./gene_meta.tsv', sep='\t', index_col=1)
gene_meta_df.head(3)

Unnamed: 0_level_0,id,title,essential,Gene->Coordinates,cog
locus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BSU_17890,000E237B00539E6850B6A50EE01BA02844B6ACBC,tkt,no,"1,919,861 1,921,864",COG0021
BSU_20290,001BF5429FA86D39BC64447929BC20FCBD988C69,yorQ,no,"2,173,956 2,174,111",
BSU_21700,0021EF5F0934B4D6A1A52C5B34B36D135E6D8CE8,ypoP,no,"2,288,194 2,288,619",COG1846


In [5]:
rep_set = set()
course_set = set()
gene_set = set()
for filepath in glob.iglob('./GSE108659/*.gz'):
    filename = os.path.basename(filepath) 
    filename = filename.split('.')[0]
    if len(filename.split('_')) > 2:
        df = pd.read_csv(filepath, compression='gzip', sep='\t', index_col=0)
        df = df.dropna()
        filename_parts = (filename.split('_'))
        rep_set.add(filename_parts[1])
        course_set.add(filename_parts[2])
        if len(gene_set) > 0:
            gene_set = gene_set.intersection(set(df['locus']))
        else:
            gene_set = set(df['locus'])

In [6]:
formated_df = pd.DataFrame(index=list(gene_set))
gene_list = list(gene_set)

In [7]:
for filepath in glob.iglob('./GSE108659/*.gz'):
    filename = os.path.basename(filepath) 
    filename = filename.split('.')[0]
    if len(filename.split('_')) > 2:
        df = pd.read_csv(filepath, compression='gzip', sep='\t', index_col=1)
        df = df.dropna()
        filename_parts = (filename.split('_'))
        rep_name = filename_parts[1]
        course_name = filename_parts[2]
        formated_df["{}@{}".format(rep_name, course_name)] = df.loc[gene_list]['norm.rpkm'].values


In [8]:
formated_df.head(3)

Unnamed: 0,4146@T5B,4146@T4A,4140@T8B,425@T4B,HR3@T4A,425@T5A,4146@T5A,4140@T3A,4146@T3A,425@T0B,...,425@T8B,HR3@T2A,HR3@T5B,425@T6B,4146@T1B,4146@T7B,425@T8A,425@T2A,4140@T4B,HR3@T4B
BSU02619,79.8,0.0,13.0,0.0,0.0,0.0,0.0,12.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,29.0,0.0,0.0,0.0,0.0
BSU17930,1006.4,1336.2,0.0,0.0,1804.0,0.0,1183.3,4.1,1623.3,1.6,...,205.3,3450.8,2206.0,20.8,779.8,948.6,62.2,116.2,1.3,2280.8
BSU19880,0.0,10.0,0.0,0.0,47.7,0.0,0.0,0.0,15.2,0.0,...,0.0,0.0,0.0,0.0,0.0,18.4,0.0,0.0,0.0,0.0


In [9]:
sorted_rep_course = sorted(formated_df.columns, key=cmp_to_key(time_course_compare))
formated_df = formated_df[sorted_rep_course]

In [10]:
rep_set

{'4140', '4146', '425', 'HR3'}

In [11]:
formated_index_list = []
for ind in formated_df.index:
    if '_' not in ind:
        formated_index_list.append('BSU_'+ind[3:])
    else:
        formated_index_list.append(ind)
formated_df.index = formated_index_list

known_names = []
known_index = []
for name in formated_df.index:
    if name in list(gene_meta_df.index):
        known_index.append(name)
        known_names.append(gene_meta_df.loc[name,'title'])

formated_df = formated_df.loc[known_index]
formated_df.index = known_names

In [12]:
# splitting source/target in time series
target_index_list = [b%7 > 0 for b in list(range(56))]
source_index_list = [b%7 < 6 for b in list(range(56))]
formated_source_df = formated_df.loc[:, source_index_list]
formated_target_df = formated_df.loc[:, target_index_list]

# train/test split
train_index_list = [b%6 < 5 for b in list(range(48))]
test_index_list = [b%6 >= 5 for b in list(range(48))]
train_source_df = formated_source_df.loc[:, train_index_list]
test_source_df = formated_source_df.loc[:, test_index_list]
train_target_df = formated_target_df.loc[:, train_index_list]
test_target_df = formated_target_df.loc[:, test_index_list]



In [13]:
formated_df.to_csv('./GSE108659/formated/all.csv')
formated_source_df.to_csv('./GSE108659/formated/source.csv')
formated_target_df.to_csv('./GSE108659/formated/target.csv')

train_source_df.to_csv('./GSE108659/formated/train_source.csv')
train_target_df.to_csv('./GSE108659/formated/train_target.csv')
test_source_df.to_csv('./GSE108659/formated/test_source.csv')
test_target_df.to_csv('./GSE108659/formated/test_target.csv')


In [14]:
df = pd.read_csv('./GSE128875/exp_mat.csv', index_col=0)
df = df.drop(columns=['gene_name_alt'], axis=1)
df = df[~df.index.duplicated()]
formated_columns = ['{}@{}'.format(column.split('_')[1], column.split('_')[0]) for column in df.columns]
df.columns = formated_columns

In [15]:
sorted_rep_course = sorted(df.columns, key=cmp_to_key(partial(time_course_compare, valued_time=True)))
formated_df = df[sorted_rep_course]

# splitting source/target in time series
target_index_list = [b%7 > 0 for b in list(range(28))]
source_index_list = [b%7 < 6 for b in list(range(28))]
formated_source_df = formated_df.loc[:, source_index_list]
formated_target_df = formated_df.loc[:, target_index_list]

# train/test split
train_index_list = [b%6 < 5 for b in list(range(24))]
test_index_list = [b%6 >= 5 for b in list(range(24))]
train_source_df = formated_source_df.loc[:, train_index_list]
test_source_df = formated_source_df.loc[:, test_index_list]
train_target_df = formated_target_df.loc[:, train_index_list]
test_target_df = formated_target_df.loc[:, test_index_list]

In [16]:
formated_df.to_csv('./GSE128875/formated/all.csv')
formated_source_df.to_csv('./GSE128875/formated/source.csv')
formated_target_df.to_csv('./GSE128875/formated/target.csv')

train_source_df.to_csv('./GSE128875/formated/train_source.csv')
train_target_df.to_csv('./GSE128875/formated/train_target.csv')
test_source_df.to_csv('./GSE128875/formated/test_source.csv')
test_target_df.to_csv('./GSE128875/formated/test_target.csv')


In [17]:
rep_set = set()
course_set = set()
gene_set = set()
for filepath in glob.iglob('./GSE224332/*.gz'):
    filename = os.path.basename(filepath) 
    filename = filename.split('.')[0]
    df = pd.read_csv(filepath, compression='gzip',  index_col=1)
    df = df.dropna()
    df = df[~df.index.duplicated()]
    filename_parts = (filename.split('_'))
    rep_set.add(filename_parts[2])
    course_set.add(filename_parts[3])
    if len(gene_set) > 0:
        gene_set = gene_set.intersection(set(df.index))
    else:
        gene_set = set(df.index)

In [18]:
formated_df = pd.DataFrame(index=list(gene_set))
gene_list = list(gene_set)

In [19]:
for filepath in glob.iglob('./GSE224332/*.gz'):
    filename = os.path.basename(filepath) 
    filename = filename.split('.')[0]
    if len(filename.split('_')) > 2:
        df = pd.read_csv(filepath, compression='gzip', index_col=1)
        df = df.dropna()
        df = df[~df.index.duplicated()]
        filename_parts = (filename.split('_'))
        rep_name = filename_parts[2]
        course_name = filename_parts[3]
        formated_df["{}@{}".format(rep_name, course_name)] = df.loc[gene_list]['edgeR-normalized count (log2)'].values


In [20]:
train_source_df, train_target_df, test_source_df, test_target_df = train_test_source_target_split(formated_df, course_set, rep_set, 2)

In [21]:
source_df = pd.concat([train_source_df, test_source_df], axis=1)
target_df = pd.concat([train_target_df, test_target_df], axis=1)

In [22]:
formated_df.to_csv('./GSE224332/formated/all.csv')
source_df.to_csv('./GSE224332/formated/source.csv')
target_df.to_csv('./GSE224332/formated/target.csv')

train_source_df.to_csv('./GSE224332/formated/train_source.csv')
train_target_df.to_csv('./GSE224332/formated/train_target.csv')
test_source_df.to_csv('./GSE224332/formated/test_source.csv')
test_target_df.to_csv('./GSE224332/formated/test_target.csv')
