Scripts for TPM normalization of Arabidopsis sequencing data

In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from scipy import stats
import gzip
import re

# Pattern for pulling one numeric literal out of a larger string: optional
# minus sign, integer part without leading zeros, optional fraction, optional
# exponent. The single capturing group wraps the whole number, so
# re.findall() returns the matched numbers as plain strings.
number_pattern = r'(-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?)'

In [2]:
# Build a per-gene length table from the Araport11 FASTA and save it as CSV.
#
# The part of each record ID before the first '.' is used as the gene ID
# (presumably this strips an isoform suffix such as "AT1G01010.1" -- confirm
# against the FASTA). When several records share a gene ID, the gene length is
# the median of their sequence lengths.
gene_length_dict = {}

# `with` guarantees the gzip handle is closed once parsing is finished.
with gzip.open("./Araport11_seq_20220914.gz", "rt") as fasta_handle:
    for fasta in SeqIO.parse(fasta_handle, 'fasta'):
        gene_id = fasta.id.split('.')[0]
        # Group lengths by gene in a single pass; dict insertion order keeps
        # the genes in order of first appearance, so the CSV row order is
        # reproducible between runs (set iteration order is not).
        gene_length_dict.setdefault(gene_id, []).append(len(fasta.seq))

# Kept for any later cell that still refers to the set of gene IDs.
gene_set = set(gene_length_dict)

gene_id_list = list(gene_length_dict)
length_list = [np.median(gene_length_dict[name]) for name in gene_id_list]

out_df = pd.DataFrame(index=gene_id_list)
out_df['gene_length'] = length_list
out_df.to_csv('./arabidopsis_gene_length.csv')

In [3]:
# Train/test split of the source and target expression matrices, followed by
# gene-length normalization and a per-sample (per-column) z-score.
#
# NOTE: TPM's "per million" step multiplies each sample by a constant, and a
# per-column z-score is unchanged by such a constant, so that step is not
# applied explicitly: z-scoring count/length gives the same values as
# z-scoring TPM (as long as the per-sample totals are positive).


def _split_columns(exp_df):
    """Return ``(train_columns, test_columns)`` for *exp_df*.

    Columns are taken by position in blocks of eight: the first six of each
    block go to the training set, the remaining two to the test set.
    """
    train_cols = [name for i, name in enumerate(exp_df.columns) if i % 8 < 6]
    test_cols = [name for i, name in enumerate(exp_df.columns) if i % 8 >= 6]
    return train_cols, test_cols


def _normalize(exp_df, columns, genes, gene_length):
    """Select *columns* and *genes*, divide by gene length, z-score per column.

    ``gene_length`` is a Series indexed by gene ID; ``div(..., axis=0)``
    aligns it with the rows. ``stats.zscore`` is applied to each column with
    its defaults.
    """
    subset = exp_df[columns].loc[genes]
    return subset.div(gene_length, axis=0).apply(stats.zscore, axis=0)


source_exp_df = pd.read_csv('./GSE97500/ts_source_exp.csv', index_col=0)
target_exp_df = pd.read_csv('./GSE97500/ts_target_exp.csv', index_col=0)

# Sorted so the row order of the output files is reproducible; iterating a
# set of strings directly can give a different order on every run.
common_genes = sorted(
    set(out_df.index) & set(source_exp_df.index) & set(target_exp_df.index)
)
if not common_genes:
    # Otherwise the cell would silently write empty matrices.
    raise ValueError(
        'no genes are shared by the gene length table and both expression tables'
    )

# Look the lengths up once instead of once per output matrix.
common_gene_length = out_df.loc[common_genes]['gene_length']

source_train_cols, source_test_cols = _split_columns(source_exp_df)
# train_idx / test_idx end up holding the target column names, as before.
train_idx, test_idx = _split_columns(target_exp_df)

train_source_df = _normalize(source_exp_df, source_train_cols, common_genes, common_gene_length)
test_source_df = _normalize(source_exp_df, source_test_cols, common_genes, common_gene_length)
train_target_df = _normalize(target_exp_df, train_idx, common_genes, common_gene_length)
test_target_df = _normalize(target_exp_df, test_idx, common_genes, common_gene_length)

train_source_df.to_csv('./GSE97500/normalized/train_source.csv.gz', compression='gzip')
test_source_df.to_csv('./GSE97500/normalized/test_source.csv.gz', compression='gzip')
train_target_df.to_csv('./GSE97500/normalized/train_target.csv.gz', compression='gzip')
test_target_df.to_csv('./GSE97500/normalized/test_target.csv.gz', compression='gzip')