Scripts for regression experiments on mouse

In [2]:
import os
import re
from itertools import product

import pandas as pd
from scipy import stats

# Regex for extracting a number from a string. The single capturing group
# matches an optional minus sign, an integer part (either "0" or a run of
# digits starting with 1-9), an optional fractional part, and an optional
# exponent (e.g. "-12.5e+3").
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'


In [3]:
def normalize(df, out_put_dir):
    """Z-score every column of ``df`` and write next-time-point train/test splits.

    Column names are expected to look like ``<replicate>_<time>``, where
    ``<time>`` parses as an integer. Each column is renamed to
    ``<replicate>@<time>`` (the time text is kept verbatim, so zero padding
    survives). Within each replicate the columns are ordered by numeric time
    and paired as (source = time point i, target = time point i + 1). The
    last 4 pairs (replicates with more than 10 time points) or the last
    3 pairs (otherwise) form the test split; the remaining pairs form the
    training split. Replicates with too few time points contribute only
    test pairs, and source/target always have the same length.

    Four gzip-compressed CSV files are written to
    ``./<out_put_dir>/normalized/``: ``train_source``, ``train_target``,
    ``test_source`` and ``test_target`` (``.csv.gz``). The directory is
    created if it does not exist.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are the samples to normalize. NOTE: its columns
        are renamed in place to the ``<replicate>@<time>`` form.
    out_put_dir : str
        Directory (relative to the current working directory) that receives
        the ``normalized/`` output folder.

    Raises
    ------
    ValueError
        If the text after the last ``_`` of a column name is not an integer.
    """
    formatted_names = []
    columns_by_rep = {}
    for name in df.columns:
        rep_name, _, time_label = name.rpartition('_')
        time = int(time_label)
        formatted = rep_name + '@' + time_label
        formatted_names.append(formatted)
        # Keep the exact renamed label next to its numeric time: rebuilding
        # the label from the int would drop zero padding ("04" -> "4") and
        # no longer match the renamed column.
        columns_by_rep.setdefault(rep_name, []).append((time, formatted))

    df.columns = formatted_names

    train_source_idx = []
    train_target_idx = []
    test_source_idx = []
    test_target_idx = []

    for timed_columns in columns_by_rep.values():
        # Order chronologically so that consecutive entries really are
        # consecutive time points, whatever the column order in the file.
        # The sort is stable, so already-ordered input is left unchanged.
        ordered = [col for _, col in sorted(timed_columns, key=lambda pair: pair[0])]
        n_test = 4 if len(ordered) > 10 else 3

        # Pair i is (ordered[i], ordered[i + 1]); slicing both lists with the
        # same bounds keeps source and target aligned for any series length.
        sources = ordered[:-1]
        targets = ordered[1:]
        train_source_idx.extend(sources[:-n_test])
        train_target_idx.extend(targets[:-n_test])
        test_source_idx.extend(sources[-n_test:])
        test_target_idx.extend(targets[-n_test:])

    out_dir = './{}/normalized'.format(out_put_dir)
    os.makedirs(out_dir, exist_ok=True)

    # axis=0 applies the z-score to each column independently.
    normalized_df = df.apply(stats.zscore, axis=0)
    splits = (
        ('train_source', train_source_idx),
        ('train_target', train_target_idx),
        ('test_source', test_source_idx),
        ('test_target', test_target_idx),
    )
    for split_name, split_columns in splits:
        normalized_df[split_columns].to_csv(
            '{}/{}.csv.gz'.format(out_dir, split_name), compression='gzip'
        )

In [7]:
# Load the GSE221103 TPM table (tab-separated, gzip-compressed), using the
# file's second column as the row index, then drop the GENEID column and
# keep only the first row for every repeated index label.
exp_df = (
    pd.read_csv(
        "./GSE221103/GSE221103_Shep_Sknas_N-MYC-ER_RNA-Seq_TPM_Circadian.txt.gz",
        sep='\t',
        index_col=1,
        compression='gzip',
    )
    .drop(columns=['GENEID'])
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
)
# Write the normalized train/test splits under ./GSE221103/normalized/.
normalize(exp_df, 'GSE221103')

In [8]:
# Load the GSE221173 TPM table (tab-separated, gzip-compressed), using the
# file's second column as the row index, then drop the GENEID column and
# keep only the first row for every repeated index label.
exp_df = (
    pd.read_csv(
        "./GSE221173/GSE221173_MYC-ER_RNA-Seq_TPM_Circadian_U2OS.txt.gz",
        sep='\t',
        index_col=1,
        compression='gzip',
    )
    .drop(columns=['GENEID'])
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
)
# Write the normalized train/test splits under ./GSE221173/normalized/.
normalize(exp_df, 'GSE221173')