Scripts for regression experiments on mouse

In [1]:
import pandas as pd
from itertools import product
import re


from scipy import stats

# Regex capturing a decimal number inside a string: optional leading '-',
# an integer part, an optional fractional part and an optional exponent
# (e.g. '0.5', '-30', '1e3'). It is used with re.search to pull the numeric
# part out of time labels such as '2.5HR' so they can be sorted numerically.
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'


In [2]:
# Time-series training/testing data curation: load the raw expression tables.
# Every table is gzip-compressed and indexed by its first column.
_gzip_indexed = {'index_col': 0, 'compression': 'gzip'}

df_1 = pd.read_csv('./GSE145936/GSE145936_Sis1-AA_Gene_counts_normalized.txt.gz', sep='\t', **_gzip_indexed)
df_2 = pd.read_csv('./GSE153609/GSE153609_gene_expression_TPM_all_times.csv.gz', **_gzip_indexed)
df_3 = pd.read_csv('./GSE168699/GSE168699_RNA_TPM_all_times.csv.gz', **_gzip_indexed)
df_4_1 = pd.read_csv('./GSE226769/GSE226769_Meiotic_Depletion_TPMs.csv.gz', **_gzip_indexed)
df_4_2 = pd.read_csv('./GSE226769/GSE226769_Mitotic_Depletion_TPMs.csv.gz', **_gzip_indexed)
df_4_3 = pd.read_csv('./GSE226769/GSE226769_UME6_T99N_AltAD_Rescue_TPMs.csv.gz', **_gzip_indexed)
df_4_4 = pd.read_csv('./GSE226769/GSE226769_UME6_T99N_Rescue_TPMs.csv.gz', **_gzip_indexed)

# Remove the first seven columns of df_3 and the 'gene name' column of df_1.
to_drop = df_3.columns[:7]
df_3 = df_3.drop(columns=to_drop)
df_1 = df_1.drop(columns=['gene name'])

# Row labels shared by the first three tables.
common_genes = list(set(df_1.index) & set(df_2.index) & set(df_3.index))




In [3]:
def train_test_source_target_split(df, time_set, rep_set):
    """Split a replicate/time table into one-step-ahead train and test pairs.

    Columns of ``df`` are addressed as ``'<replicate>@<time>'``. Time labels
    are ordered by the first number found in each label (using the
    module-level ``number_pattern``), and the ordered list is printed. A
    source column at one time point is paired with the target column of the
    same replicate at the next time point. The last transition
    (second-to-last -> last time point) is the test pair; every earlier
    transition is a training pair.

    Args:
        df: DataFrame holding a column for every replicate/time combination.
        time_set: Iterable of time-label strings, each containing a number.
        rep_set: Iterable of replicate-name strings.

    Returns:
        Tuple ``(train_source, train_target, test_source, test_target)`` of
        DataFrames. Columns are grouped by replicate (sorted by name) and,
        within a replicate, ordered by time, so column ``i`` of a source
        frame and column ``i`` of its target frame belong to the same
        replicate at consecutive time points.

    Raises:
        KeyError: If an expected ``'<replicate>@<time>'`` column is missing.
    """
    sorted_time_list = sorted(time_set, key=lambda time_str: float(re.search(number_pattern, time_str).group(1)))
    print(sorted_time_list)

    # Sort the replicates: iterating a set of strings yields a different
    # order in every Python process, which made the column order of the
    # returned (and later saved) frames irreproducible between runs.
    ordered_reps = sorted(rep_set)

    def columns_at(times):
        # Column labels for every replicate at each of the given time points.
        return ['@'.join([rep, time]) for rep, time in product(ordered_reps, times)]

    train_source_list = columns_at(sorted_time_list[:-2])
    train_target_list = columns_at(sorted_time_list[1:-1])
    test_source_list = columns_at(sorted_time_list[-2:-1])
    test_target_list = columns_at(sorted_time_list[-1:])
    return df[train_source_list], df[train_target_list], df[test_source_list], df[test_target_list]

def format_index_and_normalize(df):
    """Z-score every column of ``df`` and relabel the kept rows by gene name.

    Each row label is reduced to the text after its last ``'Name='``. The
    first two ``'/'``-separated parts of that text are treated as candidate
    gene names. A row is kept once for every candidate that occurs in the
    module-level ``common_genes`` and is labelled with that candidate, so a
    row whose two candidates are both common genes appears twice in the
    result (NOTE(review): confirm that this duplication is intended).

    Z-scores are computed column-wise over all rows of ``df``, before any
    row is filtered out.

    Args:
        df: DataFrame with string row labels and numeric columns.

    Returns:
        New DataFrame of z-scores containing only the kept rows, indexed by
        gene name. The index may contain repeated names.
    """
    normalized_df = df.apply(stats.zscore, axis=0)

    # Build the lookup once; testing membership against a list for every row
    # is linear in the number of common genes.
    gene_lookup = set(common_genes)

    selected_positions = []
    new_index_list = []
    for position, index_name in enumerate(df.index):
        names = index_name.split('Name=')[-1]
        # split() returns the whole string as a single part when there is no
        # '/', so labels with and without aliases share one code path.
        for name in names.split('/')[:2]:
            if name in gene_lookup:
                selected_positions.append(position)
                new_index_list.append(name)

    # Select by position rather than by label: with repeated row labels a
    # label-based lookup returns every matching row for each request and the
    # relabelling below would fail on a length mismatch.
    new_df = normalized_df.iloc[selected_positions]
    new_df.index = new_index_list
    return new_df

In [4]:
# Z-score the four GSE226769 tables and relabel their rows with gene names.
# NOTE: the frames are overwritten in place, so this cell is not idempotent:
# running it a second time removes six more columns from df_4_2 and
# normalises already-normalised data. Re-run from the loading cell instead.
df_4_1 = format_index_and_normalize(df_4_1)
# The last six columns of df_4_2 are excluded before normalisation.
df_4_2 = df_4_2.drop(columns=df_4_2.columns[-6:])
df_4_2 = format_index_and_normalize(df_4_2)
df_4_3 = format_index_and_normalize(df_4_3)
df_4_4 = format_index_and_normalize(df_4_4)

# Narrow the gene list to genes present in all four tables. The result is
# sorted because iterating a set of strings yields a different order in each
# Python process; without sorting, the gene (row) order of every frame
# selected with this list would change from one kernel start to the next.
common_genes = sorted(
    set(common_genes).intersection(df_4_1.index, df_4_2.index, df_4_3.index, df_4_4.index)
)

# Put all four tables on the same gene order.
df_4_1 = df_4_1.loc[common_genes]
df_4_2 = df_4_2.loc[common_genes]
df_4_3 = df_4_3.loc[common_genes]
df_4_4 = df_4_4.loc[common_genes]

# A gene name can label several rows; keep only the first row of each gene.
df_4_1 = df_4_1[~df_4_1.index.duplicated(keep='first')]
df_4_2 = df_4_2[~df_4_2.index.duplicated(keep='first')]
df_4_3 = df_4_3[~df_4_3.index.duplicated(keep='first')]
df_4_4 = df_4_4[~df_4_4.index.duplicated(keep='first')]


In [5]:
# Relabel the df_4_1 columns as '<replicate>@<time>': the text after the last
# '_' is the time label, the text before it (underscores removed) names the
# replicate.
rep_set = set()
time_set = set()
formated_name_list = []
for column_label in df_4_1.columns:
    prefix, _sep, time_name = column_label.rpartition('_')
    rep_name = prefix.replace('_', '')
    rep_set.add(rep_name)
    time_set.add(time_name)
    formated_name_list.append(f'{rep_name}@{time_name}')
print(len(rep_set), len(time_set))
df_4_1.columns = formated_name_list

# Column-wise z-score, then split into train/test source/target frames.
normalized_df = df_4_1.apply(stats.zscore, axis=0)
df_split_1 = train_test_source_target_split(normalized_df, time_set, rep_set)

# Write the four frames in the order returned by the split.
output_paths_1 = (
    './GSE226769/normalized/train_source_1.csv.gz',
    './GSE226769/normalized/train_target_1.csv.gz',
    './GSE226769/normalized/test_source_1.csv.gz',
    './GSE226769/normalized/test_target_1.csv.gz',
)
for out_path, split_frame in zip(output_paths_1, df_split_1):
    split_frame.to_csv(out_path, compression='gzip')


6 6
['0.5HR', '2HR', '2.5HR', '3HR', '4.5HR', '6HR']


In [6]:
# Relabel the df_4_2 columns as '<replicate>@<time>': the time label is the
# parenthesised part of the column name, and the text on either side of the
# parentheses, concatenated, names the replicate.
label_regex = re.compile(r'(.*)\(([^()]*)\)(.*)')
rep_set = set()
time_set = set()
formated_name_list = []
for column_label in df_4_2.columns:
    before, time_name, after = label_regex.match(column_label).groups()
    rep_name = before + after
    rep_set.add(rep_name)
    time_set.add(time_name)
    formated_name_list.append(f'{rep_name}@{time_name}')
df_4_2.columns = formated_name_list

# Column-wise z-score, then split into train/test source/target frames.
normalized_df = df_4_2.apply(stats.zscore, axis=0)
df_split_2 = train_test_source_target_split(normalized_df, time_set, rep_set)

# Write the four frames in the order returned by the split.
output_paths_2 = (
    './GSE226769/normalized/train_source_2.csv.gz',
    './GSE226769/normalized/train_target_2.csv.gz',
    './GSE226769/normalized/test_source_2.csv.gz',
    './GSE226769/normalized/test_target_2.csv.gz',
)
for out_path, split_frame in zip(output_paths_2, df_split_2):
    split_frame.to_csv(out_path, compression='gzip')

['-30min', '0min', '15min', '30min', '60min', '120min']


In [7]:
# Relabel the df_4_3 columns as '<replicate>@<time>'. A column name has two
# to four '_'-separated segments; the time label is always the second-to-last
# segment, except for two-segment names where it is the last one. The
# remaining segments, joined with '~', name the replicate.
rep_set = set()
time_set = set()
formated_name_list = []
for name in df_4_3.columns:
    name_segments = name.split('_')
    if len(name_segments) == 2:
        rep_name = name_segments[0]
        time_name = name_segments[1]
    elif len(name_segments) == 3:
        rep_name = name_segments[0] + '~' + name_segments[2]
        time_name = name_segments[1]
    elif len(name_segments) == 4:
        rep_name = name_segments[0] + '~' + name_segments[1] + '~' + name_segments[3]
        time_name = name_segments[2]
    else:
        # Without this branch an unexpected name would silently reuse the
        # replicate/time of the previous column and be mislabelled.
        raise ValueError(f'Unexpected column label format: {name!r}')
    rep_set.add(rep_name)
    time_set.add(time_name)
    formated_name_list.append(rep_name+'@'+time_name)
print(len(rep_set), len(time_set))
df_4_3.columns = formated_name_list

# Column-wise z-score, then split into train/test source/target frames.
normalized_df = df_4_3.apply(stats.zscore, axis=0)
df_split_3 = train_test_source_target_split(normalized_df, time_set, rep_set)

# Write the four frames in the order returned by the split.
df_split_3[0].to_csv('./GSE226769/normalized/train_source_3.csv.gz', compression='gzip')
df_split_3[1].to_csv('./GSE226769/normalized/train_target_3.csv.gz', compression='gzip')
df_split_3[2].to_csv('./GSE226769/normalized/test_source_3.csv.gz', compression='gzip')
df_split_3[3].to_csv('./GSE226769/normalized/test_target_3.csv.gz', compression='gzip')

12 4
['0HR', '2HR', '4HR', '6HR']


In [8]:
# Relabel the df_4_4 columns as '<replicate>@<time>'. A column name has two
# to four '_'-separated segments; the time label is always the second-to-last
# segment, except for two-segment names where it is the last one. The
# remaining segments, joined with '~', name the replicate.
# NOTE(review): the recorded output of this cell shows time labels with a
# leading space (' 0HR'); they are kept verbatim in the new column names.
rep_set = set()
time_set = set()
formated_name_list = []
for name in df_4_4.columns:
    name_segments = name.split('_')
    if len(name_segments) == 2:
        rep_name = name_segments[0]
        time_name = name_segments[1]
    elif len(name_segments) == 3:
        rep_name = name_segments[0] + '~' + name_segments[2]
        time_name = name_segments[1]
    elif len(name_segments) == 4:
        rep_name = name_segments[0] + '~' + name_segments[1] + '~' + name_segments[3]
        time_name = name_segments[2]
    else:
        # Without this branch an unexpected name would silently reuse the
        # replicate/time of the previous column and be mislabelled.
        raise ValueError(f'Unexpected column label format: {name!r}')
    rep_set.add(rep_name)
    time_set.add(time_name)
    formated_name_list.append(rep_name+'@'+time_name)
print(len(rep_set), len(time_set))
df_4_4.columns = formated_name_list

# Column-wise z-score, then split into train/test source/target frames.
normalized_df = df_4_4.apply(stats.zscore, axis=0)
df_split_4 = train_test_source_target_split(normalized_df, time_set, rep_set)

# Write the four frames in the order returned by the split.
df_split_4[0].to_csv('./GSE226769/normalized/train_source_4.csv.gz', compression='gzip')
df_split_4[1].to_csv('./GSE226769/normalized/train_target_4.csv.gz', compression='gzip')
df_split_4[2].to_csv('./GSE226769/normalized/test_source_4.csv.gz', compression='gzip')
df_split_4[3].to_csv('./GSE226769/normalized/test_target_4.csv.gz', compression='gzip')

24 4
[' 0HR', ' 2HR', ' 4HR', ' 6HR']


In [9]:
# Z-score every column of the first three tables over all of their rows, then
# keep only the common genes, in the order given by common_genes.
normalized_df_1 = df_1.apply(stats.zscore, axis=0).loc[common_genes]
normalized_df_2 = df_2.apply(stats.zscore, axis=0).loc[common_genes]
normalized_df_3 = df_3.apply(stats.zscore, axis=0).loc[common_genes]

In [10]:
def _source_target_by_position(frame, source_positions):
    """Return (source, target) column selections; each target column sits one position right of its source."""
    target_positions = [position + 1 for position in source_positions]
    return frame.iloc[:, source_positions], frame.iloc[:, target_positions]


# Table 1: columns selected by fixed positions.
train_source_df_1, train_target_df_1 = _source_target_by_position(normalized_df_1, [0, 1, 2, 6, 7, 8])
test_source_df_1, test_target_df_1 = _source_target_by_position(normalized_df_1, [3, 4, 9, 10])

# Table 2: columns selected by fixed positions.
train_source_df_2, train_target_df_2 = _source_target_by_position(normalized_df_2, [0, 1, 2])
test_source_df_2, test_target_df_2 = _source_target_by_position(normalized_df_2, [3, 4])

# Table 3: the last five column-to-next-column pairs form the test split,
# all earlier pairs the training split.
train_source_df_3, train_target_df_3 = (
    normalized_df_3.iloc[:, :-6],
    normalized_df_3.iloc[:, 1:-5],
)
test_source_df_3, test_target_df_3 = (
    normalized_df_3.iloc[:, -6:-1],
    normalized_df_3.iloc[:, -5:],
)

In [12]:
# Destination file for each train/test source/target frame of the first
# three tables; files are written in the order listed.
split_outputs = {
    './GSE145936/normalized/train_source.csv.gz': train_source_df_1,
    './GSE145936/normalized/train_target.csv.gz': train_target_df_1,
    './GSE145936/normalized/test_source.csv.gz': test_source_df_1,
    './GSE145936/normalized/test_target.csv.gz': test_target_df_1,
    './GSE153609/normalized/train_source.csv.gz': train_source_df_2,
    './GSE153609/normalized/train_target.csv.gz': train_target_df_2,
    './GSE153609/normalized/test_source.csv.gz': test_source_df_2,
    './GSE153609/normalized/test_target.csv.gz': test_target_df_2,
    './GSE168699/normalized/train_source.csv.gz': train_source_df_3,
    './GSE168699/normalized/train_target.csv.gz': train_target_df_3,
    './GSE168699/normalized/test_source.csv.gz': test_source_df_3,
    './GSE168699/normalized/test_target.csv.gz': test_target_df_3,
}
for out_path, split_frame in split_outputs.items():
    split_frame.to_csv(out_path, compression='gzip')

