In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import glob
import os
from functools import cmp_to_key, partial
from itertools import product
from scipy import stats
import gzip
from Bio import SeqIO


# Regex with a single capture group that extracts one number from a string.
# The adjacent raw-string pieces are concatenated by Python into one pattern.
number_pattern = (
    r'('
    r'-?'                  # optional leading minus sign
    r'(?:0|[1-9]\d*)'      # integer part: a lone 0, or digits without a leading 0
    r'(?:\.\d+)?'          # optional fractional part
    r'(?:[eE][+-]?\d+)?'   # optional exponent
    r')'
)


In [2]:
# Load the three GSE171975 liver read tables (BMALKO, WT, PerDKO).  The first
# CSV column of each file becomes the row index of the resulting frame.
exp_df_1, exp_df_2, exp_df_3 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        './GSE171975_Liver_BMALKO_reads.csv',
        './GSE171975_R370_Liver_WT_reads.csv',
        './GSE171975_R389_Liver_PerDKO_reads.csv',
    )
)
# Display the dimensions of the last table as a quick sanity check.
exp_df_3.shape

(12176, 46)

In [3]:
# Row labels present in all three tables.  The intersection is taken on the
# pandas indexes and then sorted, so the resulting order is identical on every
# run (iterating a set of strings is not reproducible across Python processes
# because string hashing is randomized per process).
common_genes = list(
    exp_df_1.index
    .intersection(exp_df_2.index)
    .intersection(exp_df_3.index)
    .unique()
    .sort_values()
)

In [4]:
# Restrict every table to the shared rows (in the same order) and place the
# three tables side by side, one column per sample.
exp_df = pd.concat(
    [table.loc[common_genes] for table in (exp_df_1, exp_df_2, exp_df_3)],
    axis=1,
)

In [5]:
# Parse every raw column name of the form '<prefix>_CT<time><suffix>.raw' into
# a replicate id ('<prefix><suffix>') and an integer time point, then rename
# the columns of exp_df to '<replicate>@<time>'.
# NOTE: exp_df's columns are replaced in place, so this cell is meant to run
# once on the freshly concatenated frame; already-renamed columns will normally
# no longer match the pattern and are rejected by the check below.
time_pattern = r'(.*)_CT(\d*)(_.*)\.raw'

formated_names = []
exp_dict = {}  # replicate id -> list of integer time points, in column order
for name in exp_df.columns:
    match = re.search(time_pattern, name)
    if match is None or not match.group(2):
        # Fail with the offending name instead of an opaque AttributeError /
        # int('') error further down.
        raise ValueError('cannot parse replicate/time from column name: %r' % (name,))
    # Use the integer value for both the bookkeeping dict and the new column
    # label, so labels rebuilt elsewhere as rep + '@' + str(time) always match
    # the column names even when the raw name zero-pads the time (e.g. 'CT02').
    time = int(match.group(2))
    rep = match.group(1) + match.group(3)
    exp_dict.setdefault(rep, []).append(time)
    formated_names.append(rep + '@' + str(time))

exp_df.columns = formated_names

In [6]:
# Put each replicate's list of time points into ascending order (in place).
for timepoints in exp_dict.values():
    timepoints.sort()

In [7]:
# Gene coordinate table; the column at position 2 is used as the row index.
gene_meta_df = pd.read_csv('../MGI_Gene_Model_Coord.rpt', sep='\t', index_col=2)
# Gene length is taken as end coordinate minus start coordinate.
gene_meta_df['NCBI_length'] = gene_meta_df['9. NCBI gene end'] - gene_meta_df['8. NCBI gene start']
gene_meta_df['Ensembl_length'] = gene_meta_df['14. Ensembl gene end'] - gene_meta_df['13. Ensembl gene start']

# Rows of exp_df that also appear in the coordinate table with a non-missing
# NCBI length.  Sorted so the row order of everything derived from it is the
# same on every run (a set-based intersection is ordered by randomized string
# hashes).  The coordinate table's index name is carried over, as it was when
# the labels were taken from a selection of gene_meta_df.
has_ncbi_length = gene_meta_df['NCBI_length'].notna()
known_genes = (
    exp_df.index
    .intersection(gene_meta_df.index[has_ncbi_length])
    .unique()
    .sort_values()
    .rename(gene_meta_df.index.name)
)

# One vectorised lookup instead of a .loc call per gene.
ncbi_length = gene_meta_df.loc[known_genes, 'NCBI_length']
gene_length_list = ncbi_length.tolist()

# Add a small offset, scale by 1000, divide each row by its gene length, then
# z-score every column (sample) across genes.
normalized_df = (exp_df.loc[known_genes] + 0.001) * 1000
normalized_df = normalized_df.div(ncbi_length, axis=0)
normalized_df = normalized_df.apply(stats.zscore, axis=0)

In [8]:
# Four independent, initially empty lists that will collect the column labels
# of the train/test source and target samples.
train_source_idx, train_target_idx, test_source_idx, test_target_idx = [], [], [], []

In [9]:
# Build (source, target) column-label pairs of consecutive time points for each
# replicate: source[i] is the sample at one time point and target[i] the sample
# at the next.  The last pairs of every replicate are held out for testing
# (4 pairs when the replicate has more than 10 time points, otherwise 3); all
# earlier pairs go to training.
for key in exp_dict.keys():
    exp_list = [key+'@'+str(time) for time in exp_dict[key]]
    n_test_pairs = 4 if len(exp_list) > 10 else 3
    # Position of the first held-out source sample.  Clamped at 0 so that very
    # short series still give aligned (possibly empty) source/target lists.
    split = max(len(exp_list) - n_test_pairs - 1, 0)
    train_source_idx.extend(exp_list[:split])
    train_target_idx.extend(exp_list[1:split + 1])
    # Forward slices of the tail: a reversed slice such as exp_list[:-5:-1]
    # paired with the head exp_list[:-4] would give lists of different length
    # that are not aligned and that overlap the training samples.
    test_source_idx.extend(exp_list[split:-1])
    test_target_idx.extend(exp_list[split + 1:])

# Every source label must have exactly one matching target label.
assert len(train_source_idx) == len(train_target_idx)
assert len(test_source_idx) == len(test_target_idx)

In [12]:
# Write each split (a column selection of normalized_df) as a gzip-compressed
# CSV file.  The output directory is created first so the cell also works on a
# fresh checkout where './normalized' does not exist yet.
os.makedirs('./normalized', exist_ok=True)
for out_path, sample_columns in (
    ('./normalized/train_source.csv.gz', train_source_idx),
    ('./normalized/train_target.csv.gz', train_target_idx),
    ('./normalized/test_source.csv.gz', test_source_idx),
    ('./normalized/test_target.csv.gz', test_target_idx),
):
    normalized_df[sample_columns].to_csv(out_path, compression='gzip')