In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import glob
import os
from functools import cmp_to_key, partial
from itertools import product
from scipy import stats
import gzip
from Bio import SeqIO


# Regex for extracting a number from a string. Its single capturing group matches,
# in order: an optional leading minus sign; an integer part that is either "0" or
# a non-zero digit followed by any digits (so no leading zeros); an optional
# fractional part (a dot followed by one or more digits); and an optional exponent
# ("e" or "E", an optional sign, then one or more digits).
number_pattern =  r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'


In [2]:
# Load the raw expression table from the gzipped, tab-separated file; the first
# column of the file becomes the row index.
exp_df = pd.read_csv(
    './GSE151173_Gene_RAW_Table.txt.gz',
    sep='\t',
    index_col=0,
    compression='gzip',
)
# Display the first two rows as a quick check that the table loaded as expected.
exp_df.head(2)

Unnamed: 0,KO1ZT02,KO1ZT06,KO1ZT10,KO1ZT14,KO1ZT18,KO1ZT22,KO2ZT02,KO2ZT06,KO2ZT10,KO2ZT14,...,PO5ZT10,PO5ZT14,PO5ZT18,PO5ZT22,PO6ZT02,PO6ZT06,PO6ZT10,PO6ZT14,PO6ZT18,PO6ZT22
Gnai3,95,88,164,129,166,110,207,154,189,230,...,177,183,112,181,174,227,176,197,119,926
Pbsn,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Split every sample column name on 'ZT' and collect the distinct prefixes
# (text before 'ZT', e.g. 'KO1') and suffixes (text after 'ZT', e.g. '02').
rep_set = set()
time_set = set()
for name in exp_df.columns:
    name_splits = name.split('ZT')
    rep_set.add(name_splits[0])
    time_set.add(name_splits[1])

# Sort both lists. Iteration order of a set of strings changes between Python
# processes (string hash randomisation), so list(rep_set) would give a different
# replicate order - and therefore a different column order in everything derived
# from it - on each kernel restart.
rep_list = sorted(rep_set)
# This is a plain string sort: it is chronological only while the time labels are
# zero-padded to the same width, as they are in the column names of the loaded
# table ('02', '06', ..., '22').
time_list = sorted(time_set)

In [4]:
def _sample_columns(time_points):
    """Return '<rep>ZT<time>' column names for every (replicate, time point) pair.

    Names are produced replicate-major: all of ``time_points`` for the first entry
    of ``rep_list``, then all of them for the second entry, and so on.
    """
    return [rep + 'ZT' + str(zt) for rep, zt in product(rep_list, time_points)]


# Each target list uses the time points one step after the matching source list,
# so entry k of a source list and entry k of its target list share a replicate.
# Training sources are the first three time points; test sources are the third-
# and second-to-last ones.
train_source_idx = _sample_columns(time_list[:3])
train_target_idx = _sample_columns(time_list[1:4])
test_source_idx = _sample_columns(time_list[-3:-1])
test_target_idx = _sample_columns(time_list[-2:])

In [5]:
# Load the gene-model coordinate table, indexed by its third column (index_col=2).
# NOTE(review): that column is presumably the gene symbol, since it is matched
# against the expression table's row labels below - confirm against the file.
gene_meta_df = pd.read_csv('../MGI_Gene_Model_Coord.rpt', sep='\t', index_col=2)
# Gene span according to each annotation source, computed as end - start.
gene_meta_df['NCBI_length'] = gene_meta_df['9. NCBI gene end'] - gene_meta_df['8. NCBI gene start']
gene_meta_df['Ensembl_length'] = gene_meta_df['14. Ensembl gene end'] - gene_meta_df['13. Ensembl gene start']

# Row labels present in both tables, kept in expression-table order. Building
# this from list(set(...)) would make the row order of normalized_df (and of the
# files written from it) change between kernel restarts.
meta_labels = set(gene_meta_df.index)
shared_genes = [gene for gene in exp_df.index.unique() if gene in meta_labels]

# Keep only genes with a usable NCBI length: drop missing values, and drop
# non-positive lengths, which would otherwise turn the division below into
# inf/negative values and make the per-sample z-scores NaN or meaningless.
known_lengths = gene_meta_df.loc[shared_genes, 'NCBI_length'].dropna()
known_lengths = known_lengths[known_lengths > 0]
known_genes = known_lengths.index

# Lengths in the same order as known_genes (one vectorised selection instead of
# one .loc lookup per gene).
gene_length_list = known_lengths.tolist()

# Length-normalise the counts: add a small pseudocount (0.001), scale by 1000 and
# divide each gene's row by that gene's length.
normalized_df = (exp_df.loc[known_genes] + 0.001) * 1000
normalized_df = normalized_df.div(known_lengths, axis=0)
# Standardise each sample column (axis=0) to z-scores across genes.
normalized_df = normalized_df.apply(stats.zscore, axis=0)

In [6]:
# Make sure the output directory exists so the writes below do not fail on a
# fresh checkout; exist_ok=True makes re-running this cell harmless.
os.makedirs('./normalized', exist_ok=True)

# Write each split (a selection of sample columns from normalized_df) to its own
# gzipped CSV file.
split_outputs = [
    (train_source_idx, './normalized/train_source.csv.gz'),
    (train_target_idx, './normalized/train_target.csv.gz'),
    (test_source_idx, './normalized/test_source.csv.gz'),
    (test_target_idx, './normalized/test_target.csv.gz'),
]
for split_columns, split_path in split_outputs:
    normalized_df[split_columns].to_csv(split_path, compression='gzip')